In [ ]:
##################################################################################################
## Notebook used for extracting text from HTML files, plus some basic preprocessing tasks
## v2.0 Preprocessing Text
## - Stopword removal
## - Stemming / Lemmatization
##
## Required packages: os, BeautifulSoup (bs4), nltk, wordcloud
## The following is used from the nltk package: corpora/stopwords, SnowballStemmer, WordNetLemmatizer
##
##################################################################################################
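In [ ]:
## One-time setup sketch for the NLTK data referenced below; an assumption about the environment,
## skip this cell if the corpora are already installed.
import nltk
nltk.download('stopwords')   # corpora/stopwords (only needed if the nltk stopword list is used)
nltk.download('wordnet')     # required by WordNetLemmatizer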
In [ ]:
import os
from bs4 import BeautifulSoup as bs
In [ ]:
# Function created for removing HTML tags from a document
def clean_html(htmlDoc):
    soup = bs(htmlDoc, 'html.parser')  # parse the document so tags can be located and stripped
    # Drop non-content elements (scripts, styles, the document head, etc.)
    for script in soup(["script", "style", "title", "[document]", "head"]):
        script.extract()
    # Keep ASCII text only and collapse the remaining content into a single string
    cleaned = soup.get_text(separator=' ').encode('ascii', 'ignore').decode('ascii')
    return cleaned.strip()
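In [ ]:
## Quick sanity check of clean_html on a small inline snippet (illustrative input, not one of the speeches)
sample_html = '<html><head><title>Ignore me</title></head><body><p>Monetary policy <b>matters</b>.</p></body></html>'
clean_html(sample_html)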
In [ ]:
## Collect the names of all the HTML files and clean the first speech
rootDir = 'E:\\NLP Session\\RBIGovernorSpeeches\\'
htmlFiles = [f for f in os.listdir(rootDir) if f.endswith('.html')]
fileName = rootDir + htmlFiles[0]
cleanedtext = clean_html(open(fileName))
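In [ ]:
## Quick check: how many speeches were found, and what the start of the cleaned text looks like
len(htmlFiles), cleanedtext[:200]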
In [ ]:
###########################################################################################################################
## Second paragraph from the speech
###########################################################################################################################
text1 = 'Over the last few weeks, I have outlined the RBI’s approach to inflation, distressed debt, financial inclusion, banking sector reform, and market reform. Today, I’d like to first discuss why central banking is not as easy as it appears (just raise or cut interest rates!) and why it needs decisions, sometimes unpopular or hard-to-explain ones, to be made under conditions of extreme uncertainty. This will then lead in to my arguments about why we need an independent central bank.'
print(text1)
In [ ]:
#from nltk.corpus import stopwords
#stopWords = set(stopwords.words('mystopwords'))
stopWords = [line.strip() for line in open('stopwords')]   # one stopword per line in a local file named 'stopwords'
stopWords
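In [ ]:
## Optional tweak: membership tests in the filters below are faster against a set than a list,
## so the list can be converted once up front.
stopWords = set(stopWords)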
In [ ]:
## Use a lambda function to lower-case the text, tokenize it on whitespace, and drop any token that appears in the stopword set
##
list(filter(lambda w: w not in stopWords, text1.lower().split()))
In [ ]:
## The text still contains punctuation and other non-alphanumeric characters; strip them out
##
text2 = ''.join(w for w in text1 if (w.isalnum() or w == ' '))
text2
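In [ ]:
## Equivalent sketch using a regular expression: keep only ASCII letters, digits, and spaces.
## For this (all-ASCII) paragraph it produces the same result as the comprehension above.
import re
re.sub(r'[^A-Za-z0-9 ]', '', text1)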
In [ ]:
## Removing stopwords from the cleaned text
##
list(filter(lambda w: w not in stopWords, text2.lower().split()))
In [ ]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
clean_text = filter(lambda w: not w in stopWords,text2.lower().split()) ### << Stop word removal
stemmed_words = [stemmer.stem(word) for word in clean_text]
stemmed_words
In [ ]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
clean_text = filter(lambda w: not w in stopWords,text2.lower().split())
lemmatized_words = [wordnet_lemmatizer.lemmatize(word) for word in clean_text]
lemmatized_words
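In [ ]:
## Side-by-side sketch on a few illustrative words: the stemmer truncates aggressively,
## while the lemmatizer maps words to dictionary forms (treating them as nouns by default).
for word in ['inflation', 'decisions', 'arguments', 'banking']:
    print(word, '->', stemmer.stem(word), '/', wordnet_lemmatizer.lemmatize(word))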
In [ ]:
##################################################################################################
## Drawing a wordcloud using the wordcloud package
##
##################################################################################################
from os import path
from wordcloud import WordCloud
# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
def drawWordcloud(text):
    # lower max_font_size so more words fit legibly in the figure
    wordcloud = WordCloud(max_font_size=40).generate(text)
    #wc_array = WordCloud.to_array(text)
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    return wordcloud
In [ ]:
fileName = rootDir + htmlFiles[0]
fileName
In [ ]:
fileName = rootDir + htmlFiles[0]
text = clean_html(open(fileName))
clean_text = filter(lambda w: not w in stopWords,text.lower().split())
clean_text = ' '.join(clean_text)
clean_text
In [ ]:
drawWordcloud(clean_text)
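In [ ]:
## Usage sketch: drawWordcloud returns the WordCloud object, so the rendered cloud can also be
## saved to disk; the output file name below is illustrative.
wc = drawWordcloud(clean_text)
wc.to_file('speech0_wordcloud.png')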
In [ ]:
## Running the same code for a different speech
##
fileName = rootDir + htmlFiles[4]
text = clean_html(open(fileName))
clean_text = filter(lambda w: not w in stopWords,text.lower().split())
clean_text = ' '.join(clean_text)
drawWordcloud(clean_text)
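In [ ]:
## Sketch: the same pipeline applied to every speech in the folder
## (renders one word cloud per HTML file, so it may take a while).
for f in htmlFiles:
    text = clean_html(open(rootDir + f))
    clean_text = ' '.join(w for w in text.lower().split() if w not in stopWords)
    drawWordcloud(clean_text)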
In [ ]: